In [1]:
import re
from gensim import models
from scipy import spatial
import numpy as np
import os.path
import urllib
import gzip
import json
In [2]:
def search_tags(entity, search):
    """
    Search all of the 'tags' (semantic content) of a data set entry and
    return True if the search expression is found. Case insensitive.
    """
    all_tags = '; '.join([str(x) for x in entity['tags'].values()])
    return bool(re.search(search, all_tags, flags=re.IGNORECASE))
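As a quick sanity check, here is a minimal sketch of how search_tags behaves. The entity dictionary is hypothetical, built only to mimic the tag layout the catalogs below use:
In [ ]:
# hypothetical entity for illustration only; real entries come from the catalogs
example_entity = {'tags': {'Name': 'Roundwood, softwood, average', 'Category': 'Forestry'}}
print search_tags(example_entity, 'roundwood')   # True
print search_tags(example_entity, 'steel')       # False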
In [3]:
def gunzipFile(inFileName, outFileName):
    # decompress a gzipped file to a plain file on disk
    with gzip.open(inFileName, 'rb') as inF, open(outFileName, 'wb') as outF:
        outF.write(inF.read())
In [7]:
def jaccardDistance(sent1, sent2, stoplist):
    # strip punctuation, lowercase, and drop stopwords before comparing
    sent1 = re.sub('[^0-9a-zA-Z]+', ' ', sent1)
    sent2 = re.sub('[^0-9a-zA-Z]+', ' ', sent2)
    tokens1 = set(word for word in sent1.lower().split() if word not in stoplist)
    tokens2 = set(word for word in sent2.lower().split() if word not in stoplist)
    union = set.union(tokens1, tokens2)
    if not union:
        return 1.0  # neither name has any informative words left
    # subtract from 1, so that 0 means all words in common and 1 means no words in common
    jaccardDist = 1.0 - float(len(set.intersection(tokens1, tokens2))) / float(len(union))
    return jaccardDist
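A short worked example with an empty stoplist shows the two extremes of the measure (the names below are made up for illustration, not taken from the catalogs):
In [ ]:
# illustration only, with an empty stoplist so every word counts
print jaccardDistance('Roundwood, softwood', 'roundwood softwood', [])   # 0.0: identical token sets
print jaccardDistance('Roundwood, softwood', 'crude steel', [])          # 1.0: no tokens in common
print jaccardDistance('Roundwood, softwood', 'roundwood hardwood', [])   # ~0.67: one of three tokens shared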
Load the stopwords file. These are common words (a, an, the, etc.) that we wish to exclude when performing comparisons. The file contains one word per line.
In [5]:
stopWordsFile = "en.txt"
with open(stopWordsFile) as f:
    stoplist = [x.strip('\n') for x in f.readlines()]
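A quick look at the first few entries confirms the one-word-per-line format (the exact output depends on the contents of en.txt):
In [ ]:
print stoplist[:10]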
Load the data from the two catalogs. The JSON files are gzipped, so decompress them first.
In [6]:
# http://stackoverflow.com/questions/956867/how-to-get-string-objects-instead-of-unicode-ones-from-json-in-python
# recursively convert unicode strings in the parsed JSON to UTF-8 byte strings
# (avoids unicode errors later; only needed on Python 2)
def byteify(input):
    if isinstance(input, dict):
        return {byteify(key): byteify(value)
                for key, value in input.iteritems()}
    elif isinstance(input, list):
        return [byteify(element) for element in input]
    elif isinstance(input, unicode):
        return input.encode('utf-8')
    else:
        return input
gunzipFile('../catalogs/gabi_2016_professional-database-2016.json.gz',
           '../catalogs/gabi_2016_professional-database-2016.json')
gunzipFile('../catalogs/uslci_ecospold.json.gz',
           '../catalogs/uslci_ecospold.json')
with open('../catalogs/gabi_2016_professional-database-2016.json') as data_file:
    gabi = json.load(data_file, encoding='utf-8')
with open('../catalogs/uslci_ecospold.json') as data_file:
    uslci = json.load(data_file, encoding='utf-8')
gabi = byteify(gabi)
uslci = byteify(uslci)
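The two catalogs use slightly different layouts: USLCI exposes its flows at the top level, while GaBi nests them inside an archive. A quick count sketch using the same keys as the matching cell below:
In [ ]:
print 'USLCI flows:', len(uslci['flows'])
print 'GaBi flows: ', len(gabi['archives'][0]['flows'])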
In [10]:
# pick an example flow to match: the first USLCI flow tagged 'roundwood, softwood'
roundwood = [flow for flow in uslci['flows'] if search_tags(flow, 'roundwood, softwood')]
roundwoodExample = roundwood[0]
# number of top scores to show
numTopScores = 10
# compute the Jaccard distance from the example flow's name to every GaBi flow name
flowNames = []
distValues = []
for flow in gabi['archives'][0]['flows']:
    name = flow['tags']['Name']
    flowNames.append(name)
    dist = jaccardDistance(roundwoodExample['tags']['Name'], name, stoplist)
    distValues.append(dist)
# figure out the top scores (lowest distances are the best matches)
arr = np.array(distValues)
topIndices = arr.argsort()[0:numTopScores]
topScores = arr[topIndices]
print 'Process name to match:'
print roundwoodExample['tags']['Name']
print 'Matches using Jaccard distance (lower is better):'
for i, s in zip(topIndices, topScores):
    print flowNames[i], s
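The same matching logic can be wrapped into a small reusable helper, making it easy to repeat the search for other flows. This is just a sketch; topJaccardMatches is a hypothetical name, not part of either catalog's API:
In [ ]:
def topJaccardMatches(queryName, candidateNames, stoplist, n=10):
    # return the n candidate names closest to queryName, as (distance, name) pairs
    dists = [jaccardDistance(queryName, name, stoplist) for name in candidateNames]
    order = np.array(dists).argsort()[:n]
    return [(dists[i], candidateNames[i]) for i in order]

for d, name in topJaccardMatches(roundwoodExample['tags']['Name'], flowNames, stoplist):
    print name, d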
In [ ]: